#loading packages 
library(ezids)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(tibble)
library(dplyr)
library(tidyr)
library(psych)

#loading data 
NYweath <- data.frame(read.csv("data/NYC_weather_1869_2022.csv"))

#converting to R date format and adding columns for day, month, and year
NYweath$DATE <- as.Date(NYweath$DATE)
NYweath$day <- format(NYweath$DATE, format="%d")
NYweath$month <- format(NYweath$DATE, format="%m")
NYweath$year <- format(NYweath$DATE, format="%Y")
#converting temperature observations to numerical
NYweath$TMAX <- as.numeric(NYweath$TMAX)
NYweath$TMIN <- as.numeric(NYweath$TMIN)
NYweath$TAVG <- as.numeric(NYweath$TAVG)
NYweath$year <- as.numeric(NYweath$year)
#Making month a factor
NYweath$month <- as.factor(NYweath$month)
# subset data to desired variables
NYweath_sub <- subset(NYweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW)) 
#creating a subset for 1900 on
NYweath_00 <- subset(NYweath_sub, year > 1899)
xkabledplyhead(NYweath_00)
Head
DATE day month year TMAX TMIN TAVG PRCP SNOW
11265 1900-01-01 01 01 1900 20 14 NA 0.03 0.5
11266 1900-01-02 02 01 1900 23 13 NA 0.00 0.0
11267 1900-01-03 03 01 1900 26 19 NA 0.00 0.0
11268 1900-01-04 04 01 1900 32 19 NA 0.00 0.0
11269 1900-01-05 05 01 1900 39 29 NA 0.00 0.0

CW ADD: Adding a ‘TOT_PRCP’ row that sums up the total precipitation between SNOW and PRCP. This row will be used in Question 3.

NYweath_final <- NYweath_00
NYweath_final$TOT_PRCP <- NYweath_00$PRCP + NYweath_00$SNOW

Introduction

In this project, we are digging into the relationship between human activity and weather in New York city. Our three driving questions are:

  1. How do changes in NYC weather patterns correlate to changes in population and economic activity over the same time frame?

  2. How do changes in NYC weather patterns correlate to changes in other urban climate factors such as air quality?

  3. How do changes in weather patterns correlate to other local human activity, such as crime, reported COVID cases, and stock market performance?

Local weather and global and local human environmental footprint

Emily will re-do logistic regression looking at measures of local and global human activity as regressors rather than year. She will also look into variable transformations (i.e., linear models fit to polynomials of regressors) to see if the response is best fit as linear or polynomial.

Local weather and local environmental and climate factors

Tejas will look at air local air quality data and how it relates to precipitation trends and local human activities. (Will include logistic regression.)

Local weather and local human social and economic activity

Chris will look at the local weather data and how it affects human behavior. Will look for correlations between precipitation, temperature, stock market, COVID tests, and crime

First step is to Transform Precip to Yes/No Factor Var. Will use PRCP_TOT to account for all PRCP.

# Add a column to convert PRCP to a binary factor variable. Don't care how much it rains, only if it rains. 
NYweath_prcpFact <-NYweath_final

NYweath_prcpFact$PRCP_factor <- cut(NYweath_final$TOT_PRCP, c(-Inf,0, Inf), labels = c(0,1))
NYweath_prcpFact$PRCP_factor <- as.factor(NYweath_prcpFact$PRCP_factor)

Crime data

Initial import of the data. Due to the size of the data, I imported it once, aggregated the data into a new data frame that only includes date and arrest count. This was exported and saved in the Git. The code below imports directly from that aggregated dataset.

#NYcrime <- data.frame(read.csv("/Users/christopherwasher/Documents/DATS6101/NYPD_Arrests_Data__Historic_.csv"))



#NYcrime_agg <- NYcrime %>% count(ARREST_DATE)

NYcrime_count <- tibble(read.csv("./data/NYPD_Arrests_counts.csv"))

NYcrime_count$ARREST_DATE <- as.Date(NYcrime_count$ARREST_DATE, format = "%Y-%m-%d")
#NYcrime_count$day <- format(NYcrime_count$ARREST_DATE, format="%d")
#NYcrime_count$month <- format(NYcrime_count$ARREST_DATE, format="%m")
#NYcrime_count$year <- format(NYcrime_count$ARREST_DATE, format="%Y")

colnames(NYcrime_count)[2] <- "ARREST_DATE"

head(NYcrime_count)
## # A tibble: 6 × 6
##       X ARREST_DATE NUM_ARREST   day month  year
##   <int> <date>           <int> <int> <int> <int>
## 1     1 2006-01-01         551     1     1  2006
## 2     2 2007-01-01         589     1     1  2007
## 3     3 2008-01-01         769     1     1  2008
## 4     4 2009-01-01         764     1     1  2009
## 5     5 2010-01-01         958     1     1  2010
## 6     6 2011-01-01         900     1     1  2011

Now will do summary statistics and basic EDA on the Crime Count data

crime_plot <- plot(NYcrime_count$ARREST_DATE, NYcrime_count$NUM_ARREST)

crime_boxplot <- boxplot(NYcrime_count$NUM_ARREST)

Add the Crime data to the NY Weather Data, subsetting the weather data to after 1/1/2022

crimeWeather <- subset(NYweath_prcpFact, year >= 2006 & year < 2022)
NYcrime_count <- NYcrime_count[order(NYcrime_count$ARREST_DATE),]

tail(crimeWeather)
##             DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 55819 2021-12-26  26    12 2021   51   39   NA 0.00    0     0.00           0
## 55820 2021-12-27  27    12 2021   39   34   NA 0.09    0     0.09           1
## 55821 2021-12-28  28    12 2021   47   36   NA 0.05    0     0.05           1
## 55822 2021-12-29  29    12 2021   44   41   NA 0.14    0     0.14           1
## 55823 2021-12-30  30    12 2021   49   43   NA 0.05    0     0.05           1
## 55824 2021-12-31  31    12 2021   55   48   NA 0.00    0     0.00           0
NYweath_crime <- cbind(crimeWeather, NYcrime_count$NUM_ARREST)
colnames(NYweath_crime)[12] <- c("NUM_ARREST")

#NYweath_crime_plot <- plot(sqrt(NYweath_crime$PRCP), NYweath_crime$NUM_ARREST)
#boxplot((NYweath_crime$TOT_PRCP))

NY_weathcrime_ggplot <- ggplot(NYweath_crime, aes(x = TMAX, y =NUM_ARREST)) + 
  geom_point(aes(colour = PRCP_factor)) +
  labs(x = "Temperature", y = "Number of Daily Arrests", 
       title = "Weather Patterns for NYC Crime")
NY_weathcrime_ggplot

Initially made a boxplot of precipitation to observe the distribution.. It is extremely skewed. However, because I’m only interested in determining if precipitation has an effect, will build a linear model using PRCP as a Factor.

crimeWeath_lm <- lm(NUM_ARREST ~ TMAX + PRCP_factor,  
                    data = NYweath_crime)
crimeWeathMonth_lm <- lm(NUM_ARREST ~ (TMAX + PRCP_factor * month), 
                    data = NYweath_crime)
#crimeWeathTMIN_lm <- lm(NUM_ARREST ~ (TMIN + PRCP_factor), 
#                    data = NYweath_crime)

summary(crimeWeath_lm)
## 
## Call:
## lm(formula = NUM_ARREST ~ TMAX + PRCP_factor, data = NYweath_crime)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -791.0 -264.2  -35.2  302.3  866.8 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   896.018     17.234   51.99  < 2e-16 ***
## TMAX            0.522      0.256    2.04    0.041 *  
## PRCP_factor1  -58.766      9.655   -6.09  1.2e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 352 on 5841 degrees of freedom
## Multiple R-squared:  0.00711,    Adjusted R-squared:  0.00677 
## F-statistic: 20.9 on 2 and 5841 DF,  p-value: 9.01e-10

The Linear model of Arrest Numbers as a result of temperature and precipitation. The Coefficients are significant but the R^2 is 0. This indicates there is a statistically significant relationship between Arrests and TMAX and Precipitation but these variables do not explain any of the variability in the data. Increasing TMAX correlated with an increase in Arrests. And PRCP present is associated with a decreased number of arrests.

Stock Market Data

Import the stock market data and convert the date column to a ‘Date’ data type. Also pulled out the ‘day’, ‘month’, and ‘year’ columns to help in analysis.

One last note, will need to fill in the missing date and populated the other columns with ‘NAs’. This will enable us to combine the stocks data with the weather data.

NYstock <- tibble(read.csv("./data/Dow Jones Industrial Average Historical Data.csv"))

tail(NYstock)
## # A tibble: 6 × 7
##   Date     Price     Open      High      Low       Vol.    Change..
##   <chr>    <chr>     <chr>     <chr>     <chr>     <chr>   <chr>   
## 1 11/11/22 33,749.18 33,797.75 33,815.82 33,394.49 418.43M 0.11%   
## 2 11/14/22 33,537.16 33,645.97 33,963.62 33,534.27 339.05M -0.63%  
## 3 11/15/22 33,594.17 33,804.97 33,984.90 33,321.26 380.64M 0.17%   
## 4 11/16/22 33,556.60 33,554.93 33,682.39 33,521.23 288.59M -0.11%  
## 5 11/17/22 33,547.37 33,277.00 33,615.82 33,245.78 301.53M -0.03%  
## 6 11/18/22 33,747.14 33,606.59 33,827.83 33,541.07 296.37M 0.60%
NYstock$Date <- as.Date(NYstock$Date, format = "%m/%d/%y")

NYstock2 <- NYstock
NYstock2 <- NYstock2 %>% 
  complete(Date = seq.Date(min(Date), max(Date), by="day"))

options(scientific=T, digits = 10) 


# This is all just test code for figuring out how to clean the data. 
# Not part of final script.
#NYstocktest <- NYstock2
#NYstocktest$Vol. = substr(NYstocktest$Vol.,1,nchar(NYstocktest$Vol.)-1)
#tail(NYstocktest)


#NYstocktest$Price <- gsub(",", "", NYstocktest$Price)
#NYstocktest[3:5] <- lapply(NYstocktest[3:5], gsub, pattern = ",", replacement = "") 
#NYstocktest$Change.. <- gsub("%", "", NYstocktest$Change..)
#NYstocktest[2:7] <- sapply(NYstocktest[2:7], as.numeric)
###

NYstock2$Vol. = substr(NYstock2$Vol., 1, nchar(NYstock2$Vol.) - 1)
NYstock2[2:5] <- lapply(NYstock2[2:5], gsub, pattern = ",", replacement = "") 
NYstock2$Change.. <- gsub("%", "", NYstock2$Change..)
NYstock2[2:7] <- sapply(NYstock2[2:7], as.numeric)

NYstock2$day <- format(NYstock2$Date, format="%d")
NYstock2$month <- format(NYstock2$Date, format="%m")
NYstock2$year <- format(NYstock2$Date, format="%Y")



head(NYstock2)
## # A tibble: 6 × 10
##   Date       Price  Open  High   Low  Vol. Change.. day   month year 
##   <date>     <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl> <chr> <chr> <chr>
## 1 1979-12-25  839.  839.  842.  833.    NA     0    25    12    1979 
## 2 1979-12-26  838.  839.  843.  834.    NA    -0.12 26    12    1979 
## 3 1979-12-27  840.  838.  843.  834.    NA     0.23 27    12    1979 
## 4 1979-12-28  839.  840.  843.  835.    NA    -0.14 28    12    1979 
## 5 1979-12-29   NA    NA    NA    NA     NA    NA    29    12    1979 
## 6 1979-12-30   NA    NA    NA    NA     NA    NA    30    12    1979
summary(NYstock2)
##       Date                Price               Open          
##  Min.   :1979-12-25   Min.   :  759.13   Min.   :  759.980  
##  1st Qu.:1990-09-15   1st Qu.: 2733.83   1st Qu.: 2733.387  
##  Median :2001-06-06   Median : 9447.96   Median : 9438.680  
##  Mean   :2001-06-06   Mean   :10214.49   Mean   :10212.138  
##  3rd Qu.:2012-02-26   3rd Qu.:13171.78   3rd Qu.:13170.677  
##  Max.   :2022-11-18   Max.   :36799.65   Max.   :36722.600  
##                       NA's   :4848       NA's   :4848       
##       High                Low                 Vol.         
##  Min.   :  767.410   Min.   :  729.950   Min.   :  1.5900  
##  1st Qu.: 2747.680   1st Qu.: 2715.142   1st Qu.: 40.3575  
##  Median : 9521.945   Median : 9348.660   Median :152.8850  
##  Mean   :10274.204   Mean   :10147.937   Mean   :168.8159  
##  3rd Qu.:13242.507   3rd Qu.:13098.073   3rd Qu.:257.3475  
##  Max.   :36952.650   Max.   :36636.000   Max.   :922.6800  
##  NA's   :4848        NA's   :4848        NA's   :6878      
##     Change..              day               month               year          
##  Min.   :-22.610000   Length:15670       Length:15670       Length:15670      
##  1st Qu.: -0.460000   Class :character   Class :character   Class :character  
##  Median :  0.050000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :  0.040395                                                           
##  3rd Qu.:  0.570000                                                           
##  Max.   : 11.370000                                                           
##  NA's   :4848

Really only care about the volume of data. will remove all other columns and only work with Date + Vol. Will combine that witht he weather data for further analysis.

NYstock_final <- NYstock2[,c("Date", "Vol.")]
NYstock_final <- subset(NYstock_final, Date <= "2022-09-26")
weather_stockDates <- subset(NYweath_prcpFact, DATE >= "1979-12-25")

stockWeather <- cbind(weather_stockDates, NYstock_final)
colnames(stockWeather)[13] <- c("Volume")

Now will do EDA on the the volume data to build a linear regression model. First will look at normality and look for any correlations.

stockWeather_rmNA <- subset(stockWeather, !is.na(Volume))
stock_hist <- hist(stockWeather_rmNA$Volume)

Histogram shows the data is right skewed. Will use sqrt of the volume to normalize.

stockWeather_rmNA$Volume_norm <- sqrt(stockWeather_rmNA$Volume)
stockWeather_rmNA <- subset(stockWeather_rmNA, select = -c(Date))
stockWeather_90s <- subset(stockWeather_rmNA, year >= 1988 & year <= 1999)

hist(stockWeather_rmNA$Volume_norm)

boxplot(stockWeather_rmNA$Volume_norm)

The distribution of sqrt Volume is considerably more normal. Will now look at correlations with Weather data. The boxplot shows there are no outliers after normalizing the data.

pairs.panels(stockWeather_rmNA[c("TMAX", "TOT_PRCP","PRCP_factor",
                                 "Volume","Volume_norm")], 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = FALSE,  # show density plots
             ellipses = FALSE # show correlation ellipses
             )

There are no strong correlations present in the data. Will next look at a scatter plot of the Stock Volume vs TMAX, categorized by days with PRCP.

NY_weathstock_scatter <- ggplot(stockWeather_rmNA, aes(x = TMAX, y =Volume_norm)) + 
  geom_point(aes(colour = PRCP_factor)) +
  labs(x = "Temperature", y = "Total Daily DOW Trade Volume", 
       title = "Weather Patterns for DOW Trade Volume")
NY_weathstock_scatter

## Trying again with the 90's stock data.

NY_90s_weathstock_scatter <- ggplot(stockWeather_90s, aes(x = TMAX, y =Volume_norm)) + 
  geom_point(aes(colour = PRCP_factor)) +
  labs(x = "Temperature", y = "Total Daily DOW Trade Volume", 
       title = "Weather Patterns for DOW Trade Volume in the 1990s")
NY_90s_weathstock_scatter

stock_LM <- lm(Volume_norm ~ TMAX + TOT_PRCP, stockWeather_rmNA)
summary(stock_LM)
## 
## Call:
## lm(formula = Volume_norm ~ TMAX + TOT_PRCP, data = stockWeather_rmNA)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -10.6329998  -5.4951601   0.5605502   4.1901861  18.3633455 
## 
## Coefficients:
##                Estimate  Std. Error  t value   Pr(>|t|)    
## (Intercept) 12.48308116  0.21583691 57.83571 < 2.22e-16 ***
## TMAX        -0.01120901  0.00323609 -3.46375 0.00053525 ***
## TOT_PRCP     0.05385785  0.07405008  0.72732 0.46705155    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.390441 on 8750 degrees of freedom
## Multiple R-squared:  0.001550555,    Adjusted R-squared:  0.001322338 
## F-statistic: 6.794215 on 2 and 8750 DF,  p-value: 0.001126157
stock_LM_90s <- lm(Volume_norm ~ TMAX + TOT_PRCP , stockWeather_90s)
summary(stock_LM_90s)
## 
## Call:
## lm(formula = Volume_norm ~ TMAX + TOT_PRCP, data = stockWeather_90s)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.5061465 -1.3084280 -0.6047107  0.8857008 10.7204436 
## 
## Coefficients:
##                 Estimate   Std. Error  t value Pr(>|t|)    
## (Intercept)  5.992519579  0.128703406 46.56069  < 2e-16 ***
## TMAX        -0.003602446  0.001930036 -1.86652 0.062066 .  
## TOT_PRCP    -0.102963402  0.050345137 -2.04515 0.040926 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.882202 on 3030 degrees of freedom
## Multiple R-squared:  0.002192332,    Adjusted R-squared:  0.001533713 
## F-statistic:  3.32868 on 2 and 3030 DF,  p-value: 0.0359715

The Linear Model that incorporates all the Stock data from 1988-present had a statistically significant

COVID Data

Looking at the effect of precipitation and temperature on the number of positive COVID cases. Using the “CASE_COUNT” parameter for the NYC Covid dataset. CASE_COUNT represents the count of patients tested who were confirmed to be COVID-19 cases on date_of_interest

options(scientific=T, digits = 3) 
NYcovid <- tibble(read.csv("./data/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv"))
options()
## $add.smooth
## [1] TRUE
## 
## $ambiguousMethodSelection
## function (cond) 
## NULL
## <environment: R_EmptyEnv>
## 
## $bitmapType
## [1] "quartz"
## 
## $browser
## [1] "/usr/bin/open"
## 
## $browserNLdisabled
## [1] FALSE
## 
## $callr.condition_handler_cli_message
## function (msg) 
## {
##     custom_handler <- getOption("cli.default_handler")
##     if (is.function(custom_handler)) {
##         custom_handler(msg)
##     }
##     else {
##         cli_server_default(msg)
##     }
## }
## <bytecode: 0x13b5e98b8>
## <environment: namespace:cli>
## 
## $CBoundsCheck
## [1] FALSE
## 
## $check.bounds
## [1] FALSE
## 
## $citation.bibtex.max
## [1] 1
## 
## $continue
## [1] "+ "
## 
## $contrasts
##         unordered           ordered 
## "contr.treatment"      "contr.poly" 
## 
## $cpp11_preserve_xptr
## <pointer: 0x12a369b30>
## 
## $defaultPackages
## [1] "datasets"  "utils"     "grDevices" "graphics"  "stats"     "methods"  
## 
## $demo.ask
## [1] "default"
## 
## $deparse.cutoff
## [1] 60
## 
## $device
## function (width = 7, height = 7, ...) 
## {
##     grDevices::pdf(NULL, width, height, ...)
## }
## <bytecode: 0x11b57d448>
## <environment: namespace:knitr>
## 
## $device.ask.default
## [1] FALSE
## 
## $digits
## [1] 3
## 
## $dplyr.show_progress
## [1] TRUE
## 
## $dvipscmd
## [1] "dvips"
## 
## $echo
## [1] FALSE
## 
## $editor
## [1] "vi"
## 
## $encoding
## [1] "native.enc"
## 
## $example.ask
## [1] "default"
## 
## $expressions
## [1] 5000
## 
## $help.search.types
## [1] "vignette" "demo"     "help"    
## 
## $help.try.all.packages
## [1] FALSE
## 
## $htmltools.preserve.raw
## [1] TRUE
## 
## $HTTPUserAgent
## [1] "R (4.2.2 aarch64-apple-darwin20 aarch64 darwin20)"
## 
## $httr_oauth_cache
## [1] NA
## 
## $httr_oob_default
## [1] FALSE
## 
## $install.packages.compile.from.source
## [1] "interactive"
## 
## $internet.info
## [1] 2
## 
## $keep.parse.data
## [1] TRUE
## 
## $keep.parse.data.pkgs
## [1] FALSE
## 
## $keep.source
## [1] FALSE
## 
## $keep.source.pkgs
## [1] FALSE
## 
## $knitr.in.progress
## [1] TRUE
## 
## $knitr.table.format
## [1] "html"
## 
## $lme4.summary.cor.max
## [1] 12
## 
## $locatorBell
## [1] TRUE
## 
## $mailer
## [1] "mailto"
## 
## $matprod
## [1] "default"
## 
## $max.print
## [1] 99999
## 
## $menu.graphics
## [1] TRUE
## 
## $na.action
## [1] "na.omit"
## 
## $nloptr.show.inequality.warning
## [1] TRUE
## 
## $nwarnings
## [1] 50
## 
## $OutDec
## [1] "."
## 
## $pager
## [1] "/Library/Frameworks/R.framework/Resources/bin/pager"
## 
## $papersize
## [1] "a4"
## 
## $PCRE_limit_recursion
## [1] NA
## 
## $PCRE_study
## [1] FALSE
## 
## $PCRE_use_JIT
## [1] TRUE
## 
## $pdfviewer
## [1] "/usr/bin/open"
## 
## $pkgType
## [1] "both"
## 
## $printcmd
## [1] "lpr"
## 
## $prompt
## [1] "> "
## 
## $repos
##     CRAN 
## "@CRAN@" 
## 
## $rl_word_breaks
## [1] " \t\n\"\\'`><=%;,|&{()}"
## 
## $scientific
## [1] TRUE
## 
## $scipen
## [1] 0
## 
## $show.coef.Pvalues
## [1] TRUE
## 
## $show.error.messages
## [1] TRUE
## 
## $show.signif.stars
## [1] TRUE
## 
## $showErrorCalls
## [1] TRUE
## 
## $str
## $str$strict.width
## [1] "no"
## 
## $str$digits.d
## [1] 3
## 
## $str$vec.len
## [1] 4
## 
## $str$list.len
## [1] 99
## 
## $str$deparse.lines
## NULL
## 
## $str$drop.deparse.attr
## [1] TRUE
## 
## $str$formatNum
## function (x, ...) 
## format(x, trim = TRUE, drop0trailing = TRUE, ...)
## <environment: 0x13b7b0920>
## 
## 
## $str.dendrogram.last
## [1] "`"
## 
## $stringsAsFactors
## [1] FALSE
## 
## $texi2dvi
## [1] "/opt/R/arm64/bin/texi2dvi"
## 
## $tikzMetricsDictionary
## [1] "Chris_analysis-tikzDictionary"
## 
## $timeout
## [1] 60
## 
## $try.outFile
## A connection with                            
## description "output"        
## class       "textConnection"
## mode        "wr"            
## text        "text"          
## opened      "opened"        
## can read    "no"            
## can write   "yes"           
## 
## $ts.eps
## [1] 1e-05
## 
## $ts.S.compat
## [1] FALSE
## 
## $unzip
## [1] "/usr/bin/unzip"
## 
## $useFancyQuotes
## [1] FALSE
## 
## $verbose
## [1] FALSE
## 
## $warn
## [1] 0
## 
## $warning.length
## [1] 1000
## 
## $width
## [1] 80
NYcovid <- select(NYcovid, 1:3)

head(NYcovid)
## # A tibble: 6 × 3
##   date_of_interest CASE_COUNT PROBABLE_CASE_COUNT
##   <chr>                 <int>               <int>
## 1 02/29/2020                1                   0
## 2 03/01/2020                0                   0
## 3 03/02/2020                0                   0
## 4 03/03/2020                1                   0
## 5 03/04/2020                5                   0
## 6 03/05/2020                3                   0
colnames(NYcovid)[1] <- "DATE"

NYcovid$DATE <- as.Date(NYcovid$DATE, format = "%m/%d/%Y")
NYcovid$day <- format(NYcovid$DATE, format="%d")
NYcovid$month <- format(NYcovid$DATE, format="%m")
NYcovid$year <- format(NYcovid$DATE, format="%Y")

head(NYcovid)
## # A tibble: 6 × 6
##   DATE       CASE_COUNT PROBABLE_CASE_COUNT day   month year 
##   <date>          <int>               <int> <chr> <chr> <chr>
## 1 2020-02-29          1                   0 29    02    2020 
## 2 2020-03-01          0                   0 01    03    2020 
## 3 2020-03-02          0                   0 02    03    2020 
## 4 2020-03-03          1                   0 03    03    2020 
## 5 2020-03-04          5                   0 04    03    2020 
## 6 2020-03-05          3                   0 05    03    2020
summary(NYcovid)
##       DATE              CASE_COUNT    PROBABLE_CASE_COUNT     day           
##  Min.   :2020-02-29   Min.   :    0   Min.   :   0        Length:991        
##  1st Qu.:2020-11-02   1st Qu.:  574   1st Qu.:  76        Class :character  
##  Median :2021-07-08   Median : 1437   Median : 343        Mode  :character  
##  Mean   :2021-07-08   Mean   : 2534   Mean   : 478                          
##  3rd Qu.:2022-03-12   3rd Qu.: 2796   3rd Qu.: 644                          
##  Max.   :2022-11-15   Max.   :54947   Max.   :5882                          
##     month               year          
##  Length:991         Length:991        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Next, Looked at normality of the COVID count data. The counts were extremely skewed to the right. First removed multiple rounds of outliers using the outlierKD2 funciton. After removing all outliers, the data was still skewed right but less extreme. Used a square-root transform to normalize the data.

covid_plot <- plot(NYcovid$DATE, NYcovid$CASE_COUNT)

covid_boxplot <- boxplot(NYcovid$CASE_COUNT)

NYcovid_rmOuts <- outlierKD2(NYcovid, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 42 
## Proportion (%) of outliers: 4.4 
## Mean of the outliers: 21841 
## Mean without removing outliers: 2534 
## Mean if we remove outliers: 1680 
## Outliers successfully removed
NYcovid_rmOuts2 <- outlierKD2(NYcovid_rmOuts, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 15 
## Proportion (%) of outliers: 1.6 
## Mean of the outliers: 5696 
## Mean without removing outliers: 1680 
## Mean if we remove outliers: 1615 
## Outliers successfully removed
NYcovid_rmOuts3 <- outlierKD2(NYcovid_rmOuts2, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 7 
## Proportion (%) of outliers: 0.8 
## Mean of the outliers: 5200 
## Mean without removing outliers: 1615 
## Mean if we remove outliers: 1588 
## Outliers successfully removed
NYcovid_rmOuts4 <- outlierKD2(NYcovid_rmOuts3, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 2 
## Proportion (%) of outliers: 0.2 
## Mean of the outliers: 5094 
## Mean without removing outliers: 1588 
## Mean if we remove outliers: 1581 
## Outliers successfully removed
covid_plot <- plot(NYcovid_rmOuts4$DATE, NYcovid_rmOuts4$CASE_COUNT)
tail(NYcovid_rmOuts3)
## # A tibble: 6 × 6
##   DATE       CASE_COUNT PROBABLE_CASE_COUNT day   month year 
##   <date>          <int>               <int> <chr> <chr> <chr>
## 1 2022-11-10       1957                 621 10    11    2022 
## 2 2022-11-11       1580                 395 11    11    2022 
## 3 2022-11-12       1477                 539 12    11    2022 
## 4 2022-11-13       1332                 514 13    11    2022 
## 5 2022-11-14       2861                 800 14    11    2022 
## 6 2022-11-15       2120                 652 15    11    2022
sqrt_count <- sqrt(NYcovid_rmOuts3$CASE_COUNT)
#hist(sqrt_count)

NYcovid_final <- cbind(NYcovid_rmOuts4, sqrt_count)
head(NYcovid_final)
##         DATE CASE_COUNT PROBABLE_CASE_COUNT day month year sqrt_count
## 1 2020-02-29          1                   0  29    02 2020       1.00
## 2 2020-03-01          0                   0  01    03 2020       0.00
## 3 2020-03-02          0                   0  02    03 2020       0.00
## 4 2020-03-03          1                   0  03    03 2020       1.00
## 5 2020-03-04          5                   0  04    03 2020       2.24
## 6 2020-03-05          3                   0  05    03 2020       1.73

Add the Covid data to the NY Weather Data, subsetting the weather data to after 2/29/2022

covWeather <- subset(NYweath_prcpFact, DATE >= ("2020-02-29"))
NYcovid_finaldates <- subset(NYcovid_final, DATE <= "2022-09-26")
tail(covWeather)
##             DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 56088 2022-09-21  21    09 2022   80   62   NA 0.00    0     0.00           0
## 56089 2022-09-22  22    09 2022   75   57   NA 0.47    0     0.47           1
## 56090 2022-09-23  23    09 2022   63   51   NA 0.00    0     0.00           0
## 56091 2022-09-24  24    09 2022   69   49   NA 0.00    0     0.00           0
## 56092 2022-09-25  25    09 2022   72   59   NA 1.11    0     1.11           1
## 56093 2022-09-26  26    09 2022   74   58   NA 0.00    0     0.00           0
NYweath_prcpCov <- cbind(covWeather, NYcovid_finaldates$CASE_COUNT,
                         NYcovid_finaldates$sqrt_count)
colnames(NYweath_prcpCov)[12:13] <- c("CASE_COUNT", "sqrt_count")

covCount_prcp_plot <- plot(NYweath_prcpCov$sqrt_count, sqrt(NYweath_prcpCov$PRCP))

Plot of COV case count vs precipitation. no apparent relationship, however, more interested in effect of precip not so much about the correlation in prcp

T-test comparing Covid positive counts on days with precipitation vs days without prcp.

cov_prcp1 <- subset(NYweath_prcpCov, PRCP_factor == 1)
cov_prcp0 <- subset(NYweath_prcpCov, PRCP_factor == 0)



cov_count_ttest <- t.test(cov_prcp0$sqrt_count, cov_prcp1$sqrt_count)
cov_count_ttest
## 
##  Welch Two Sample t-test
## 
## data:  cov_prcp0$sqrt_count and cov_prcp1$sqrt_count
## t = 0.2, df = 650, p-value = 0.8
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.02  2.49
## sample estimates:
## mean of x mean of y 
##      36.3      36.1
cov_count_bplot <- ggplot()+
  geom_boxplot(data = NYweath_prcpCov, aes(y = sqrt_count, x = PRCP_factor)) +
  labs(title = "COVID Positive Counts")

cov_count_bplot

## Repeating this EDA looking only at Covid cases from 2021+. 
cov_2021 <- subset(NYweath_prcpCov, year >= 2021)

cov_2021count_bplot <- ggplot()+
  geom_boxplot(data = cov_2021, aes(y = sqrt_count, x = PRCP_factor)) +
  labs(title = "2021 COVID Positive Counts")
cov_2021count_bplot

cov_2021prcp1 <- subset(cov_2021, PRCP_factor == 1)
cov_2021prcp0 <- subset(cov_2021, PRCP_factor == 0)

cov_2021count_ttest <- t.test(cov_2021prcp0$sqrt_count, cov_2021prcp1$sqrt_count)
cov_2021count_ttest
## 
##  Welch Two Sample t-test
## 
## data:  cov_2021prcp0$sqrt_count and cov_2021prcp1$sqrt_count
## t = 1, df = 414, p-value = 0.2
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.06  4.10
## sample estimates:
## mean of x mean of y 
##      40.2      38.7

No significant difference in the mean Covid case counts on days with precipitation or without. However, there was a greater difference in the means when only incorporating Covid from 2021+.

NYweath_cov_final <- NYweath_prcpCov[,c(1:5, 8, 10:13)]

covWeath_final_scatter <- ggplot(NYweath_cov_final, 
                                 aes(x = TMAX, y =sqrt_count)) + 
  geom_point(aes(shape = PRCP_factor, colour = month)) +
  labs(x = "Temperature", 
       y = "Square Root of Total Daily DOW Trade Volume", 
       title = "Weather Patterns for Covid Case Counts")
covWeath_final_scatter

Will now build a linear model that incorporates temperature, precipitation, and Month to predict Covid counts.

library(psych)


pairs.panels(NYweath_cov_final[4:10], 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = FALSE,  # show density plots
             ellipses = FALSE # show correlation ellipses
             )

cov_weathLM <- lm(sqrt_count ~ TMAX + PRCP_factor ,
                  data = NYweath_cov_final)
summary(cov_weathLM)
## 
## Call:
## lm(formula = sqrt_count ~ TMAX + PRCP_factor, data = NYweath_cov_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45.36 -12.66  -1.68  12.03  39.53 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    57.068      2.205   25.88   <2e-16 ***
## TMAX           -0.306      0.031   -9.85   <2e-16 ***
## PRCP_factor1   -0.576      1.090   -0.53      0.6    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.5 on 874 degrees of freedom
##   (64 observations deleted due to missingness)
## Multiple R-squared:   0.1,   Adjusted R-squared:  0.0979 
## F-statistic: 48.5 on 2 and 874 DF,  p-value: <2e-16

Summary of Key Findings

Conclusion